Slip 17


Q.1. Implement ensemble ML algorithms on the Pima Indians Diabetes Database using bagging
(random forest), boosting, voting, and stacking methods, and display the analysis
for each method. Compare the results.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Load dataset: column names are supplied explicitly through `names`
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

df = pd.read_csv(url, names=cols)

# "Outcome" is the label to predict; every other column is a predictor
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Train-test split: 33% of the rows are held out, with a fixed seed so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Store results: display name -> test-set accuracy (read by the comparison plot)
results = {}


def _make_adaboost():
    """Build the AdaBoost model, tolerating scikit-learn releases without `algorithm`.

    `algorithm="SAMME"` is requested explicitly, as in the original script.
    scikit-learn 1.6 deprecated that keyword (SAMME is the only remaining
    algorithm) and scheduled its removal for 1.8, so a release that rejects
    the keyword falls back to the default constructor.
    """
    try:
        return AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=42)
    except TypeError:
        # The keyword is not accepted by this scikit-learn release.
        return AdaBoostClassifier(n_estimators=100, random_state=42)


def _fit_and_report(model, banner, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, print its evaluation, and return ``(accuracy, predictions)``.

    Args:
        model: Classifier exposing ``fit`` and ``predict``.
        banner: Heading line printed before the metrics.
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Held-out features and labels used for scoring.

    Returns:
        Tuple of the accuracy on the held-out data and the predicted labels.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(banner)
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_eval, predictions))
    print("Classification Report:\n", classification_report(y_eval, predictions))
    return accuracy, predictions


# ========== Base learners ==========
rf = RandomForestClassifier(n_estimators=100, random_state=42)      # bagging
ada = _make_adaboost()                                               # boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)  # boosting

# ========== Meta-ensembles built on the three base learners ==========
base_learners = [('rf', rf), ('ada', ada), ('gb', gb)]
voting = VotingClassifier(
    estimators=list(base_learners),
    voting='hard'
)
stacking = StackingClassifier(
    estimators=list(base_learners),
    final_estimator=LogisticRegression(),
    passthrough=False
)

# (results key, banner, estimator) in the order they are trained and plotted.
# Banner strings are kept exactly as the script has always printed them.
experiments = [
    ("Random Forest", "==== Random Forest (Bagging) =====", rf),
    ("AdaBoost", "\n===== AdaBoost (Boosting) =====", ada),
    ("Gradient Boosting", "\n===== Gradient Boosting =====", gb),
    ("Voting Ensemble", "\n===== Voting Ensemble =====", voting),
    ("Stacking Ensemble", "\n===== Stacking Ensemble =====", stacking),
]

for label, banner, estimator in experiments:
    # y_pred ends up holding the predictions of the last model evaluated
    results[label], y_pred = _fit_and_report(
        estimator, banner, X_train, y_train, X_test, y_test
    )

# ===========================================
# Plot Accuracy Comparison
# ===========================================
# One bar per entry in `results`, in insertion order, labelled with its accuracy.
model_names = list(results)
accuracies = list(results.values())

plt.figure(figsize=(8, 5))
plt.bar(model_names, accuracies, color="skyblue", edgecolor="black")
plt.ylabel("Accuracy")
plt.title("Ensemble Methods Accuracy Comparison")
plt.xticks(rotation=30)

# Keep the 0.60-0.85 window when every score fits inside it, but widen it when
# a score falls outside so no bar or value label is cut off by the axis limits.
y_low = max(0.0, min(0.6, min(accuracies, default=0.6) - 0.05))
y_high = max(0.85, max(accuracies, default=0.85) + 0.05)
plt.ylim(y_low, y_high)

for position, accuracy in enumerate(accuracies):
    # Place the two-decimal value just above the top of its bar
    plt.text(position, accuracy + 0.005, f"{accuracy:.2f}", ha="center")
plt.show()

Q.2. Write a Python program to implement Multiple Linear Regression for a house price
dataset.

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read the housing data from disk
# Point this at your own file if it is not called 'house_prices.csv'
df = pd.read_csv("house_prices.csv")

print("✅ Dataset loaded successfully!\n")
preview = df.head()
print("First 5 rows:\n", preview, "\n")

# Step 2: Report how many missing values each column contains
null_counts = df.isnull().sum()
print("Null values in dataset:\n", null_counts, "\n")

# (Optional) Keep only the rows that have no missing values
df = df.dropna()

# Step 3: Choose the predictors (X) and the value to predict (y)
# The names below assume a dataset with columns such as
# 'Area', 'Bedrooms', 'Bathrooms', 'Price' — rename them to match your file
feature_columns = ['Area', 'Bedrooms', 'Bathrooms']
X = df[feature_columns]   # independent variables
y = df['Price']           # dependent variable

# Step 4: Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Step 5: Fit the Multiple Linear Regression model on the training rows
# (fit() returns the estimator itself, so construction and training chain)
model = LinearRegression().fit(X_train, y_train)

# Step 6: Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Step 7: Score the predictions against the held-out prices
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("📊 Model Evaluation:")
print("Mean Squared Error:", round(mse, 2))
print("R² Score:", round(r2, 4))

# Fitted parameters: the intercept, then one coefficient per feature column
intercept = round(model.intercept_, 2)
print("\nIntercept (b0):", intercept)
print("Coefficients (b1, b2, b3...):")
for position, col in enumerate(X.columns):
    print(f"  {col}: {round(model.coef_[position], 2)}")

# Step 8: Put actual and predicted prices side by side
comparison = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})
first_rows = comparison.head()
print("\nComparison of Actual vs Predicted Prices:\n", first_rows)

# Step 9: Scatter the predicted prices against the actual ones
plt.figure(figsize=(6,4))
ax = sns.scatterplot(x=y_test, y=y_pred, color='blue')
# Label through the Axes object returned by seaborn
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted House Prices")
plt.show()

# Step 10: Price a single house that is not in the dataset
# The one-row frame uses the same three feature columns the model was trained on
new_data = pd.DataFrame([{'Area': 2500, 'Bedrooms': 4, 'Bathrooms': 3}])
predicted_price = model.predict(new_data)
print(f"\n🏠 Predicted price for new house = ${predicted_price[0]:.2f}")